import pandas as pd
import numpy as np
# Load the raw dataset; the file is semicolon-delimited.
dt = pd.read_csv("data.csv", sep=";")
dt.head()
# Observation: the data has redundancies in the "mois" (month) column.
dt.shape
dt.info()
# numeric_only=True keeps the summary restricted to numeric columns.
# pandas >= 2.0 no longer drops non-numeric columns silently, so a bare
# dt.mean() raises TypeError if any column holds strings
# (presumably "mois" does -- TODO confirm against data.csv).
dt.mean(numeric_only=True)
# Load the COVID-19 case data (semicolon-delimited).
covid19_tn = pd.read_csv("covid19_tn.csv", sep=";")
covid19_tn["date"][1]
# We want to change the date format to yyyy-mm, like in data.csv.
# Assumes the raw values are "dd/mm/yyyy" strings -- TODO confirm against the CSV.
# The conversion is done on the whole column at once: assigning row by row
# through covid19_tn["date"][i] is chained assignment, which pandas warns
# about and which does not write back to the frame under copy-on-write.
month_year = covid19_tn["date"].str[3:].str.replace("/", "-", regex=False)  # "mm-yyyy"
covid19_tn["date"] = month_year.str[3:] + "-" + month_year.str[:2]  # "yyyy-mm"
#covid19_tn.to_csv("covid19_tn_For_model.csv",index=False)
covid19_tn.head()
# Collapse the rows to one per month, keeping the maximum of every column,
# and expose the grouping key under the name "mois" so it can be merged
# with the per-month frame built from data.csv.
covid19_tn1 = (
    covid19_tn.groupby("date")
    .max()
    .reset_index()  # the group key comes back as a regular "date" column
    .rename(columns={"date": "mois"})
)
covid19_tn1
covid19_tn1.info()
# Make sure the merge key is a plain string column.
covid19_tn1["mois"] = covid19_tn1["mois"].astype(str)
import seaborn as sns
import matplotlib.pyplot as plt
# Static point plot of the monthly case counts.
#g1 = plt.subplot(212)
g1 = sns.pointplot(x="mois", y="case", data=covid19_tn1)
g1.set_xlabel("mois", fontsize=12)
g1.set_ylabel("case", fontsize=12)
g1.set_title("covid-19 tunisie", fontsize=20)
plt.subplots_adjust(wspace=0.5, hspace=0.5, top=1)
plt.show()
import plotly.graph_objs as go
import plotly.graph_objs as go
import plotly.tools as tls # It's useful to we get some tools of plotly
# Interactive line chart of the monthly case counts.
#covid19_tn1['mois'] = list(covid19_tn1.index)
monthly_trace = go.Scatter(
    x=covid19_tn1['mois'],
    y=covid19_tn1['case'],
    mode='lines+markers',
    name='Total Cases',
)
fig = go.Figure()
fig.add_trace(monthly_trace)
fig.update_layout(
    title_text='Confirmed cases in Tunisia each MONTH',
    plot_bgcolor='rgb(250, 242, 242)',
)
fig.show()
# We have to sum "quantites_reelles" grouped by mois.
# One row per month: sum every column of the raw data within each "mois".
# reset_index() turns the group key back into a regular "mois" column, so
# no further renaming is needed.
df = dt.groupby("mois").sum().reset_index()
df
# Merge the two datasets.
# Outer join on the month key keeps months that appear in only one of the
# two frames; the result is saved to disk.
df_finale = pd.merge(df, covid19_tn1, how="outer", on=["mois"])
df_finale
df_finale.to_csv("df_finale.csv", index=False)
# Default (inner) join for comparison: only months present in both frames.
pd.merge(df, covid19_tn1, on=["mois"])
# With a dataset this small, machine-learning techniques cannot produce a
# reliable prediction. A GAN could be used to generate more data, but for now
# let's imagine that we have a big dataset and carry on with the work.
# I don't want to add random values to the quantity column, because they would
# increase the loss; since I don't have real data for it, I choose to ignore
# that column.
# We will use covid19_tn to make predictions.
import plotly.express as px

# Re-read the case file from disk into a separate frame used for modelling.
covid19_tn_model = pd.read_csv("covid19_tn.csv", sep=";")
covid19_tn_model

# Bar chart of the case counts by date.
bar_layout = dict(
    title="Distribution of Number of Cases",
    xaxis_title="Date",
    yaxis_title="Number of Cases",
)
fig = px.bar(x=covid19_tn_model["date"], y=covid19_tn_model["case"])
fig.update_layout(**bar_layout)
fig.show()
# Line-and-marker view of the same series, legend pinned to the top-left.
confirmed_trace = go.Scatter(
    x=covid19_tn_model["date"],
    y=covid19_tn_model["case"],
    mode='lines+markers',
    name='Confirmed Cases',
)
fig = go.Figure()
fig.add_trace(confirmed_trace)
fig.update_layout(
    title="Number cases",
    xaxis_title="Date",
    yaxis_title="Number of Cases",
    legend=dict(x=0, y=1, traceorder="normal"),
)
fig.show()
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics import mean_squared_error,r2_score
# Chronological split: the first 95% of the rows train the model and the
# last 5% are held out for validation.
split_at = int(covid19_tn_model.shape[0] * 0.95)
train_ml = covid19_tn_model.iloc[:split_at]
valid_ml = covid19_tn_model.iloc[split_at:]
# Validation RMSE of each model, appended in the order the models are fitted.
model_scores = []

# Regress the case count on the row position (the frame's index).
# The `normalize` argument was removed in scikit-learn 1.2; for an
# unpenalised least-squares fit it does not change the predictions, so the
# estimator is simply built without it.
lin_reg = LinearRegression()
lin_reg.fit(
    np.array(train_ml.index).reshape(-1, 1),
    np.array(train_ml["case"]).reshape(-1, 1),
)
prediction_valid_linreg = lin_reg.predict(np.array(valid_ml.index).reshape(-1, 1))
rmse_linreg = np.sqrt(mean_squared_error(valid_ml["case"], prediction_valid_linreg))
model_scores.append(rmse_linreg)
print("Root Mean Square Error for Linear Regression: ", rmse_linreg)

# Fitted line over the whole date range. The target was passed as a column
# vector, so the predictions come back with shape (n, 1); flatten them for
# plotting.
prediction_linreg = lin_reg.predict(np.array(covid19_tn_model.index).reshape(-1, 1))
linreg_output = prediction_linreg.ravel().tolist()

fig = go.Figure()
fig.add_trace(go.Scatter(x=covid19_tn_model["date"], y=covid19_tn_model["case"],
                         mode='lines+markers', name="Train Data for Confirmed Cases"))
fig.add_trace(go.Scatter(x=covid19_tn_model["date"], y=linreg_output,
                         mode='lines', name="Linear Regression Best Fit Line",
                         line=dict(color='black', dash='dot')))
fig.update_layout(title="Confirmed Cases Linear Regression Prediction",
                  xaxis_title="Date", yaxis_title="Confirmed Cases",
                  legend=dict(x=0, y=1, traceorder="normal"))
fig.show()
# The linear regression model is clearly falling apart: the trend of
# confirmed cases is visibly not linear.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Degree-8 polynomial expansion of the row position.
poly = PolynomialFeatures(degree=8)

# Same chronological 95% / 5% split as for the linear model.
split_at = int(covid19_tn_model.shape[0] * 0.95)
train_ml = covid19_tn_model.iloc[:split_at]
valid_ml = covid19_tn_model.iloc[split_at:]

# Fit the expansion on the training rows only and reuse it (transform, not
# fit_transform) for every other set of rows.
train_poly = poly.fit_transform(np.array(train_ml.index).reshape(-1, 1))
valid_poly = poly.transform(np.array(valid_ml.index).reshape(-1, 1))
y = train_ml["case"]

# LinearRegression(normalize=True) no longer exists (removed in
# scikit-learn 1.2). Scaling the features in a pipeline is the replacement
# scikit-learn recommends; it keeps the high-degree terms numerically
# manageable.
linreg = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
linreg.fit(train_poly, y)

prediction_poly = linreg.predict(valid_poly)
rmse_poly = np.sqrt(mean_squared_error(valid_ml["case"], prediction_poly))
# model_scores is created once, with the first model, and only appended to
# here so the earlier score is not discarded.
model_scores.append(rmse_poly)
print("Root Mean Squared Error for Polynomial Regression: ", rmse_poly)

# Fitted curve over the whole date range.
comp_data = poly.transform(np.array(covid19_tn_model.index).reshape(-1, 1))
predictions_poly = linreg.predict(comp_data)

fig = go.Figure()
fig.add_trace(go.Scatter(x=covid19_tn_model["date"], y=covid19_tn_model["case"],
                         mode='lines+markers', name="Train Data for Confirmed Cases"))
fig.add_trace(go.Scatter(x=covid19_tn_model["date"], y=predictions_poly,
                         mode='lines', name="Polynomial Regression Best Fit",
                         line=dict(color='black', dash='dot')))
fig.update_layout(title="Confirmed Cases Polynomial Regression Prediction",
                  xaxis_title="Date", yaxis_title="Confirmed Cases",
                  legend=dict(x=0, y=1, traceorder="normal"))
fig.show()
# I don't like this model.
from sklearn.svm import SVR
# Same chronological 95% / 5% split as for the previous models.
split_at = int(covid19_tn_model.shape[0] * 0.95)
train_ml = covid19_tn_model.iloc[:split_at]
valid_ml = covid19_tn_model.iloc[split_at:]

# Initialising the SVR model (polynomial kernel of degree 6).
svm = SVR(C=1, degree=6, kernel='poly', epsilon=0.01)
# Fitting the model on the training data. The target is passed as a 1-D
# array: SVR expects shape (n_samples,) and warns on a column vector.
svm.fit(np.array(train_ml.index).reshape(-1, 1), np.array(train_ml["case"]))

prediction_valid_svm = svm.predict(np.array(valid_ml.index).reshape(-1, 1))
rmse_svm = np.sqrt(mean_squared_error(valid_ml["case"], prediction_valid_svm))
# model_scores is created once, with the first model, and only appended to
# here so the earlier scores are not discarded.
model_scores.append(rmse_svm)
print("Root Mean Square Error for Support Vectore Machine: ", rmse_svm)

# Fitted curve over the whole date range.
prediction_svm = svm.predict(np.array(covid19_tn_model.index).reshape(-1, 1))

fig = go.Figure()
fig.add_trace(go.Scatter(x=covid19_tn_model["date"], y=covid19_tn_model["case"],
                         mode='lines+markers', name="Train Data for Confirmed Cases"))
fig.add_trace(go.Scatter(x=covid19_tn_model["date"], y=prediction_svm,
                         mode='lines', name="Support Vector Machine Best fit Kernel",
                         line=dict(color='black', dash='dot')))
fig.update_layout(title="Confirmed Cases Support Vectore Machine Regressor Prediction",
                  xaxis_title="Date", yaxis_title="Confirmed Cases",
                  legend=dict(x=0, y=1, traceorder="normal"))
fig.show()
from statsmodels.tsa.api import Holt,SimpleExpSmoothing,ExponentialSmoothing
# Chronological 95% / 5% split; y_pred collects the forecasts next to the
# held-out observations.
split_at = int(covid19_tn_model.shape[0] * 0.95)
train_ml = covid19_tn_model.iloc[:split_at]
valid = covid19_tn_model.iloc[split_at:]
y_pred = valid.copy()

# Holt's linear-trend smoothing with fixed (non-optimised) parameters.
# `smoothing_trend` is the current name of the argument statsmodels used to
# call `smoothing_slope`; the old spelling is rejected by recent releases.
holt = Holt(np.asarray(train_ml["case"])).fit(
    smoothing_level=0.4, smoothing_trend=0.4, optimized=False
)
y_pred["Holt"] = holt.forecast(len(valid))
rmse_holt = np.sqrt(mean_squared_error(y_pred["case"], y_pred["Holt"]))
model_scores.append(rmse_holt)
print("Root Mean Square Error Holt's Linear Model: ", rmse_holt)

fig = go.Figure()
# The "Train Data" trace shows only the training rows, so the validation
# points are not drawn twice.
fig.add_trace(go.Scatter(x=train_ml.index, y=train_ml["case"],
                         mode='lines+markers', name="Train Data for Confirmed Cases"))
fig.add_trace(go.Scatter(x=valid.index, y=valid["case"],
                         mode='lines+markers', name="Validation Data for Confirmed Cases"))
fig.add_trace(go.Scatter(x=valid.index, y=y_pred["Holt"],
                         mode='lines+markers', name="Prediction of Confirmed Cases"))
fig.update_layout(title="Confirmed Cases Holt's Linear Model Prediction",
                  xaxis_title="Date", yaxis_title="Confirmed Cases",
                  legend=dict(x=0, y=1, traceorder="normal"))
fig.show()
from pmdarima.arima import auto_arima
import pmdarima as pm
#pip install pmdarima ----- try this
# Chronological 95% / 5% split; y_pred collects the forecasts next to the
# held-out observations.
split_at = int(covid19_tn_model.shape[0] * 0.95)
model_train = covid19_tn_model.iloc[:split_at]
valid = covid19_tn_model.iloc[split_at:]
y_pred = valid.copy()

# Pure autoregressive search (max_q=0). The model is selected and fitted on
# the training rows only: fitting on the full series would let it see the
# validation rows it is scored on, and its forecast would start after the
# end of the data rather than at the first validation row.
model_ar = auto_arima(model_train["case"], trace=True, error_action='ignore',
                      start_p=0, start_q=0, max_p=4, max_q=0,
                      suppress_warnings=True, stepwise=False, seasonal=False)
model_ar.fit(model_train["case"])
prediction_ar = model_ar.predict(len(valid))
# Assign by position so the forecast lines up with the validation rows
# whatever index the forecast object carries.
y_pred["AR Model Prediction"] = np.asarray(prediction_ar)
rmse_ar = np.sqrt(mean_squared_error(y_pred["case"], y_pred["AR Model Prediction"]))
model_scores.append(rmse_ar)
print("Root Mean Square Error for AR Model: ", rmse_ar)

fig = go.Figure()
fig.add_trace(go.Scatter(x=model_train.index, y=model_train["case"],
                         mode='lines+markers', name="Train Data for Confirmed Cases"))
fig.add_trace(go.Scatter(x=valid.index, y=valid["case"],
                         mode='lines+markers', name="Validation Data for Confirmed Cases"))
fig.add_trace(go.Scatter(x=valid.index, y=y_pred["AR Model Prediction"],
                         mode='lines+markers', name="Prediction of Confirmed Cases"))
fig.update_layout(title="Confirmed Cases AR Model Prediction",
                  xaxis_title="Date", yaxis_title="Confirmed Cases",
                  legend=dict(x=0, y=1, traceorder="normal"))
fig.show()
# Chronological 95% / 5% split; y_pred collects the forecasts next to the
# held-out observations.
split_at = int(covid19_tn_model.shape[0] * 0.95)
model_train = covid19_tn_model.iloc[:split_at]
valid = covid19_tn_model.iloc[split_at:]
y_pred = valid.copy()

# Pure moving-average search (max_p=0), non-seasonal, exhaustive.
ma_search = dict(
    start_p=0, start_q=0, max_p=0, max_q=2,
    seasonal=False, stepwise=False,
    trace=True, error_action='ignore', suppress_warnings=True,
)
model_ma = auto_arima(model_train["case"], **ma_search)
model_ma.fit(model_train["case"])
prediction_ma = model_ma.predict(len(valid))
y_pred["MA Model Prediction"] = prediction_ma
rmse_ma = np.sqrt(mean_squared_error(valid["case"], prediction_ma))
model_scores.append(rmse_ma)
print("Root Mean Square Error for MA Model: ", rmse_ma)

# Training rows, held-out rows and the forecast on one chart.
fig = go.Figure()
for xs, ys, label in (
    (model_train.index, model_train["case"], "Train Data for Confirmed Cases"),
    (valid.index, valid["case"], "Validation Data for Confirmed Cases"),
    (valid.index, y_pred["MA Model Prediction"], "Prediction for Confirmed Cases"),
):
    fig.add_trace(go.Scatter(x=xs, y=ys, mode='lines+markers', name=label))
fig.update_layout(
    title="Confirmed Cases MA Model Prediction",
    xaxis_title="Date",
    yaxis_title="Confirmed Cases",
    legend=dict(x=0, y=1, traceorder="normal"),
)
fig.show()
# Chronological 95% / 5% split; y_pred collects the forecasts next to the
# held-out observations.
split_at = int(covid19_tn_model.shape[0] * 0.95)
model_train = covid19_tn_model.iloc[:split_at]
valid = covid19_tn_model.iloc[split_at:]
y_pred = valid.copy()

# Non-seasonal ARIMA search with p and q each ranging from 1 to 3, exhaustive.
arima_search = dict(
    start_p=1, start_q=1, max_p=3, max_q=3,
    seasonal=False, stepwise=False,
    trace=True, error_action='ignore', suppress_warnings=True,
)
model_arima = auto_arima(model_train["case"], **arima_search)
model_arima.fit(model_train["case"])
prediction_arima = model_arima.predict(len(valid))
y_pred["ARIMA Model Prediction"] = prediction_arima
rmse_arima = np.sqrt(mean_squared_error(valid["case"], prediction_arima))
model_scores.append(rmse_arima)
print("Root Mean Square Error for ARIMA Model: ", rmse_arima)

# Training rows, held-out rows and the forecast on one chart.
fig = go.Figure()
for xs, ys, label in (
    (model_train.index, model_train["case"], "Train Data for Confirmed Cases"),
    (valid.index, valid["case"], "Validation Data for Confirmed Cases"),
    (valid.index, y_pred["ARIMA Model Prediction"], "Prediction for Confirmed Cases"),
):
    fig.add_trace(go.Scatter(x=xs, y=ys, mode='lines+markers', name=label))
fig.update_layout(
    title="Confirmed Cases ARIMA Model Prediction",
    xaxis_title="Date",
    yaxis_title="Confirmed Cases",
    legend=dict(x=0, y=1, traceorder="normal"),
)
fig.show()
# Seasonal ARIMA search with a period of 7 observations (m=7), stepwise.
# Reuses model_train, valid and y_pred as they currently stand.
sarima_search = dict(
    start_p=0, start_q=0, max_p=2, max_q=2, m=7,
    seasonal=True, stepwise=True,
    trace=True, error_action='ignore', suppress_warnings=True,
)
model_sarima = auto_arima(model_train["case"], **sarima_search)
model_sarima.fit(model_train["case"])
prediction_sarima = model_sarima.predict(len(valid))
y_pred["SARIMA Model Prediction"] = prediction_sarima
rmse_sarima = np.sqrt(
    mean_squared_error(y_pred["case"], y_pred["SARIMA Model Prediction"])
)
model_scores.append(rmse_sarima)
print("Root Mean Square Error for SARIMA Model: ", rmse_sarima)

# Training rows, held-out rows and the forecast on one chart.
fig = go.Figure()
for xs, ys, label in (
    (model_train.index, model_train["case"], "Train Data for Confirmed Cases"),
    (valid.index, valid["case"], "Validation Data for Confirmed Cases"),
    (valid.index, y_pred["SARIMA Model Prediction"], "Prediction for Confirmed Cases"),
):
    fig.add_trace(go.Scatter(x=xs, y=ys, mode='lines+markers', name=label))
fig.update_layout(
    title="Confirmed Cases SARIMA Model Prediction",
    xaxis_title="Date",
    yaxis_title="Confirmed Cases",
    legend=dict(x=0, y=1, traceorder="normal"),
)
fig.show()
# SVM and ARIMA are the best models.